In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
# Profiling Libraries
from ydata_profiling import ProfileReport
In [2]:
uc= pd.read_csv(r"C:\Users\acer\Desktop\4th  Semester\DBMS Lab\SQL Unicorn Project\CSV Files\Unicorn_Companies.csv")
companies = pd.read_csv(r"C:\Users\acer\Desktop\4th  Semester\DBMS Lab\SQL Unicorn Project\CSV Files\Companies.csv")
dates = pd.read_csv(r"C:\Users\acer\Desktop\4th  Semester\DBMS Lab\SQL Unicorn Project\CSV Files\Dates.csv")
fundings = pd.read_csv(r"C:\Users\acer\Desktop\4th  Semester\DBMS Lab\SQL Unicorn Project\CSV Files\Fundings.csv")
industry = pd.read_csv(r"C:\Users\acer\Desktop\4th  Semester\DBMS Lab\SQL Unicorn Project\CSV Files\Industry.csv")
In [13]:
ProfileReport(uc, title="Unicorn Companies Profiling Report", explorative=True)
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[13]:

In [12]:
uc.head()
Out[12]:
ID Company Date Joined Industry City Country Continent Valuation Funding Select Investors Year Founded
0 1 Bytedance 4/7/2017 Artificial intelligence Beijing China Asia $180B $8B Sequoia Capital China, SIG Asia Investments, S... 2012
1 2 SpaceX 12/1/2012 Other Hawthorne United States North America $100B $7B Founders Fund, Draper Fisher Jurvetson, Rothen... 2002
2 3 SHEIN 7/3/2018 E-commerce & direct-to-consumer Shenzhen China Asia $100B $2B Tiger Global Management, Sequoia Capital China... 2008
3 4 Stripe 1/23/2014 Fintech San Francisco United States North America $95B $2B Khosla Ventures, LowercaseCapital, capitalG 2010
4 5 Klarna 12/12/2011 Fintech Stockholm Sweden Europe $46B $4B Institutional Venture Partners, Sequoia Capita... 2005
In [5]:
uc.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ID                1074 non-null   int64 
 1   Company           1074 non-null   object
 2   Date Joined       1074 non-null   object
 3   Industry          1074 non-null   object
 4   City              1059 non-null   object
 5   Country           1074 non-null   object
 6   Continent         1074 non-null   object
 7   Year Founded      1074 non-null   int64 
 8   Funding           1074 non-null   object
 9   Select Investors  1073 non-null   object
 10  Total Raised      1074 non-null   object
 11  Valuation         1074 non-null   object
dtypes: int64(2), object(10)
memory usage: 100.8+ KB
In [59]:
uc.describe()
Out[59]:
ID Year Founded
count 1074.000000 1074.000000
mean 537.500000 2013.231844
std 310.181399 4.129630
min 1.000000 2003.500000
25% 269.250000 2011.000000
50% 537.500000 2014.000000
75% 805.750000 2016.000000
max 1074.000000 2021.000000
In [60]:
uc.describecribe(include= ['object'])
Out[60]:
Company Date Joined Industry City Country Continent Funding Select Investors Total Raised Valuation
count 1074 1074 1074 1074 1074 1074 1074 1074 1074 1074
unique 1073 639 16 258 46 6 539 1059 914 30
top Bolt 7/13/2021 Fintech San Francisco United States North America $1B Sequoia Capital None $1B
freq 2 9 224 152 562 589 60 3 24 471
In [28]:
uc.isnull().sum()
Out[28]:
ID                   0
Company              0
Date Joined          0
Industry             0
City                15
Country              0
Continent            0
Year Founded         0
Funding              0
Select Investors     1
Total Raised         0
Valuation            0
dtype: int64
In [29]:
uc.shape
Out[29]:
(1074, 12)
In [34]:
data_dup = uc.duplicated().any()
print(data_dup)
False
In [37]:
dict = {}
for i in list(uc.columns):
    dict[i] = uc[i].value_counts().shape[0]

pd.DataFrame(dict,index=["unique count"]).transpose()
Out[37]:
unique count
ID 1074
Company 1073
Date Joined 639
Industry 16
City 258
Country 46
Continent 6
Year Founded 35
Funding 539
Select Investors 1059
Total Raised 914
Valuation 30
In [3]:
continuous_values = []
categorical_values = []

for column in uc.columns:
    if uc[column].dtype == 'int64' or uc[column].dtype == 'float64':
        continuous_values.append(column)
    else:
        categorical_values.append(column)
In [15]:
categorical_values
Out[15]:
['Company',
 'Date Joined',
 'Industry',
 'City',
 'Country',
 'Continent',
 'Valuation',
 'Funding',
 'Select Investors']
In [41]:
continuous_values
Out[41]:
['ID', 'Year Founded']
In [16]:
uc.Continent.unique()
Out[16]:
array(['Asia', 'North America', 'Europe', 'Oceania', 'South America',
       'Africa'], dtype=object)
In [7]:
uc.Country.unique().tolist()
Out[7]:
['China',
 'United States',
 'Sweden',
 'Australia',
 'United Kingdom',
 'Bahamas',
 'India',
 'Indonesia',
 'Turkey',
 'Estonia',
 'Germany',
 'Hong Kong',
 'South Korea',
 'Mexico',
 'Canada',
 'Netherlands',
 'France',
 'Finland',
 'Israel',
 'Lithuania',
 'Denmark',
 'Belgium',
 'Colombia',
 'Brazil',
 'Singapore',
 'Austria',
 'Ireland',
 'United Arab Emirates',
 'Switzerland',
 'Vietnam',
 'South Africa',
 'Thailand',
 'Norway',
 'Chile',
 'Argentina',
 'Bermuda',
 'Japan',
 'Spain',
 'Malaysia',
 'Senegal',
 'Philippines',
 'Luxembourg',
 'Nigeria',
 'Czech Republic',
 'Croatia',
 'Italy']
In [8]:
uc.Industry.unique().tolist()
Out[8]:
['Artificial intelligence',
 'Other',
 'E-commerce & direct-to-consumer',
 'Fintech',
 'Internet software & services',
 'Supply chain, logistics, & delivery',
 'Consumer & retail',
 'Data management & analytics',
 'Edtech',
 'Health',
 'Hardware',
 'Auto & transportation',
 'Travel',
 'Cybersecurity',
 'Mobile & telecommunications',
 'Artificial Intelligence']
In [4]:
# Convert the 'Date Joined' column from a 'string' to a 'Datetime'
uc['Date Joined'] = pd.to_datetime(uc['Date Joined'])
In [5]:
uc.dtypes
Out[5]:
ID                           int64
Company                     object
Date Joined         datetime64[ns]
Industry                    object
City                        object
Country                     object
Continent                   object
Valuation                   object
Funding                     object
Select Investors            object
Year Founded                 int64
dtype: object
In [6]:
# Replace the "Artificial intelligence" in **Industry** column with "Artificial Intelligence"
uc['Industry'] = uc['Industry'].replace('Artificial intelligence', 'Artificial Intelligence')
In [7]:
uc['City'].fillna('Unknown',inplace = True)
# fill the missing value in 'Select Investors' with 'Unknown'
uc['Select Investors'] = uc['Select Investors'].fillna('Unknown')
uc.isnull().sum()
Out[7]:
ID                  0
Company             0
Date Joined         0
Industry            0
City                0
Country             0
Continent           0
Valuation           0
Funding             0
Select Investors    0
Year Founded        0
dtype: int64
In [8]:
# Replace the string "Unknown" in Funding with "0"
uc['Funding'] = uc['Funding'].replace('Unknown', '0')
In [9]:
# find the row column row data string 'Unknown' in the 'Funding' column
uc[uc['Funding'] == 'Unknown'].head(3)
Out[9]:
ID Company Date Joined Industry City Country Continent Valuation Funding Select Investors Year Founded
In [10]:
uc[uc['Funding'] == '0'].head(3)
Out[10]:
ID Company Date Joined Industry City Country Continent Valuation Funding Select Investors Year Founded
215 216 SSENSE 2021-06-08 E-commerce & direct-to-consumer Montreal Canada North America $4B 0 Sequoia Capital 2003
424 425 Uplight 2021-03-03 Other Boulder United States North America $2B 0 Rubicon Technology Partners, Max Ventures, Inc... 2019
567 568 ISN 2020-12-17 Supply chain, logistics, & delivery Dallas United States North America $2B 0 Blackstone 2001
In [27]:
selected_rows = uc.loc[uc['Country'] == 'Germany'].head(3)
selected_rows
Out[27]:
ID Company Date Joined Industry City Country Continent Valuation Funding Select Investors Year Founded
45 46 Celonis 2018-06-26 Data management & analytics Munich Germany Europe $11B $1B Accel, 83North 2011
67 68 N26 2019-01-10 Fintech Berlin Germany Europe $9B $2B Redalpine Venture Partners, Earlybird Venture ... 2013
116 117 Personio 2021-01-19 Internet software & services Munich Germany Europe $6B $524M Global Founders Capital, Nortzone Ventures, Pi... 2015
In [11]:
def convert_valuation(valuation_str):
    # Remove the '$' signs from the string
    valuation_num = valuation_str.replace('$', ' ')
    
    # Extract the numerical part of the string
    numeric_part = valuation_num[:-1]
    
    # Get the last character to determine the scale (Billion or Million)
    scale = valuation_num[-1]
    
    # Multiply the numeric value based on the scale
    if  scale == 'B':
        valuation_float = float(numeric_part) * 1e9
    elif scale == 'M':
        valuation_float = float(numeric_part) * 1e6
    else:
        raise ValueError('Invalid scale: {}', format(scale))
        
    # Add back the '$' sign and return as a Decimal object
    return float(valuation_float)

# Apply the custom function to the "Funding" column and create a new column 'Funding Decimal'
uc['Valuation Decimal'] = uc['Valuation'].apply(convert_valuation)
    
uc.head(3)
Out[11]:
ID Company Date Joined Industry City Country Continent Valuation Funding Select Investors Year Founded Valuation Decimal
0 1 Bytedance 2017-04-07 Artificial Intelligence Beijing China Asia $180B $8B Sequoia Capital China, SIG Asia Investments, S... 2012 1.800000e+11
1 2 SpaceX 2012-12-01 Other Hawthorne United States North America $100B $7B Founders Fund, Draper Fisher Jurvetson, Rothen... 2002 1.000000e+11
2 3 SHEIN 2018-07-03 E-commerce & direct-to-consumer Shenzhen China Asia $100B $2B Tiger Global Management, Sequoia Capital China... 2008 1.000000e+11
In [12]:
# Define a custom function to convert the string to Decimal
def convert_funding(funding_str):
    # Remove the '$' signs from the string
    funding_num = funding_str.replace('$', ' ')
    
    # Extract the numerical part of the string
    numeric_part = funding_num[:-1]
    
    # Get the last character to determine the scale (Billion or Million)
    scale = funding_num[-1]
    
    # Multiply the numeric value based on the scale
    if  scale == 'B':
        funding_float = float(numeric_part) * 1e9
    elif scale == 'M':
        funding_float = float(numeric_part) * 1e6
    elif scale == '0':
        funding_float = '0'
    else:
        raise ValueError('Invalid scale: {}', format(scale))
        
    # Add back the '$' sign and return as a Decimal object
    return float(funding_float)

# Apply the custom function to the "Funding" column and create a new column 'Funding Decimal'
uc['Funding Decimal'] = uc['Funding'].apply(convert_funding)
    
uc.head(3)
Out[12]:
ID Company Date Joined Industry City Country Continent Valuation Funding Select Investors Year Founded Valuation Decimal Funding Decimal
0 1 Bytedance 2017-04-07 Artificial Intelligence Beijing China Asia $180B $8B Sequoia Capital China, SIG Asia Investments, S... 2012 1.800000e+11 8.000000e+09
1 2 SpaceX 2012-12-01 Other Hawthorne United States North America $100B $7B Founders Fund, Draper Fisher Jurvetson, Rothen... 2002 1.000000e+11 7.000000e+09
2 3 SHEIN 2018-07-03 E-commerce & direct-to-consumer Shenzhen China Asia $100B $2B Tiger Global Management, Sequoia Capital China... 2008 1.000000e+11 2.000000e+09
In [13]:
uc.dtypes
Out[13]:
ID                            int64
Company                      object
Date Joined          datetime64[ns]
Industry                     object
City                         object
Country                      object
Continent                    object
Valuation                    object
Funding                      object
Select Investors             object
Year Founded                  int64
Valuation Decimal           float64
Funding Decimal             float64
dtype: object
In [14]:
uc.drop(columns= ['Valuation', 'Funding'], inplace= True)
uc.dtypes
Out[14]:
ID                            int64
Company                      object
Date Joined          datetime64[ns]
Industry                     object
City                         object
Country                      object
Continent                    object
Select Investors             object
Year Founded                  int64
Valuation Decimal           float64
Funding Decimal             float64
dtype: object
In [15]:
# Rename the 'Valuation Decimal' and 'Funding Decimal' back to 'Valuation' and 'Funding'
uc.rename(columns={'Valuation Decimal' : 'Valuation'}, inplace= True)
uc.dtypes
Out[15]:
ID                           int64
Company                     object
Date Joined         datetime64[ns]
Industry                    object
City                        object
Country                     object
Continent                   object
Select Investors            object
Year Founded                 int64
Valuation                  float64
Funding Decimal            float64
dtype: object
In [16]:
uc.rename(columns={'Funding Decimal' : 'Funding'}, inplace= True)
uc.dtypes
Out[16]:
ID                           int64
Company                     object
Date Joined         datetime64[ns]
Industry                    object
City                        object
Country                     object
Continent                   object
Select Investors            object
Year Founded                 int64
Valuation                  float64
Funding                    float64
dtype: object
In [17]:
# Change the position of the column 'Valuation' back to where it was initially
column_names= uc.columns.tolist()

# Remove the 'Valuation' column from the current position
valuation_col1 = column_names.pop(column_names.index('Valuation'))
# Insert the 'Valuation' column at the desired position
column_names.insert(1, valuation_col1)

# Reindex the dataframe with the new column order
uc = uc.reindex(columns= column_names)
    
uc.head(3)
Out[17]:
ID Valuation Company Date Joined Industry City Country Continent Select Investors Year Founded Funding
0 1 1.800000e+11 Bytedance 2017-04-07 Artificial Intelligence Beijing China Asia Sequoia Capital China, SIG Asia Investments, S... 2012 8.000000e+09
1 2 1.000000e+11 SpaceX 2012-12-01 Other Hawthorne United States North America Founders Fund, Draper Fisher Jurvetson, Rothen... 2002 7.000000e+09
2 3 1.000000e+11 SHEIN 2018-07-03 E-commerce & direct-to-consumer Shenzhen China Asia Tiger Global Management, Sequoia Capital China... 2008 2.000000e+09
In [16]:
# Recheck the data size
uc.size
Out[16]:
11814
In [18]:
# Recheck the data size
uc.shape
Out[18]:
(1074, 11)
In [19]:
# Recheck the data columns
uc.columns
Out[19]:
Index(['ID', 'Valuation', 'Company', 'Date Joined', 'Industry', 'City',
       'Country', 'Continent', 'Select Investors', 'Year Founded', 'Funding'],
      dtype='object')
In [21]:
new_size = uc.groupby(['Continent', 'Country']).size()
new_size
Out[21]:
Continent      Country             
Africa         Nigeria                   1
               Senegal                   1
               South Africa              1
Asia           China                   173
               Hong Kong                 6
               India                    65
               Indonesia                 6
               Israel                   20
               Japan                     5
               Malaysia                  1
               Philippines               2
               Singapore                12
               South Africa              1
               South Korea              12
               Thailand                  2
               United Arab Emirates      3
               Vietnam                   2
Europe         Austria                   2
               Belgium                   3
               Croatia                   1
               Czech Republic            1
               Denmark                   2
               Estonia                   2
               Finland                   4
               France                   24
               Germany                  26
               Ireland                   5
               Italy                     1
               Lithuania                 1
               Luxembourg                1
               Netherlands               6
               Norway                    4
               Spain                     3
               Sweden                    6
               Switzerland               5
               Turkey                    3
               United Kingdom           43
North America  Bahamas                   1
               Bermuda                   1
               Canada                   19
               Mexico                    6
               United States           562
Oceania        Australia                 8
South America  Argentina                 1
               Brazil                   16
               Chile                     2
               Colombia                  2
dtype: int64
In [22]:
uc.Continent.unique()
Out[22]:
array(['Asia', 'North America', 'Europe', 'Oceania', 'South America',
       'Africa'], dtype=object)
In [23]:
uc.Continent.value_counts()
Out[23]:
North America    589
Asia             310
Europe           143
South America     21
Oceania            8
Africa             3
Name: Continent, dtype: int64

For Further Study of Each Column DataType, Value Counts and Correlation You Can See PRofile Report In The Beginning¶

KPI'S¶

In [25]:
# Total Valuation of Unicorn Companies
valuation_sum = uc['Valuation'].sum()
new_valuation = '${:,.3f}.T'.format(valuation_sum/ 10**12)
new_valuation
Out[25]:
'$3.711.T'
In [26]:
# Total Number of Unicorn Continents
continents = uc.Continent.nunique()
continents
Out[26]:
6
In [27]:
# Total Number of Unicorn Continents
Countries = uc.Country.nunique()
Countries
Out[27]:
46
In [28]:
# Total Amount Received by Unicorn Companies
funding_sum = uc['Funding'].sum()
new_funding = '${:,.3f}.B'.format(funding_sum/ 10**9)
new_funding
Out[28]:
'$591.820.B'
In [29]:
# Total Valuation of Unicorn Companies
valuation_sum = uc['Valuation'].sum()
new_valuation = '${:,.3f}.T'.format(valuation_sum/ 10**12)

# Total Number of Unicorn Companies
companies = uc.Company.nunique() - -1

# Total Number of Unicorn Industries
industries = uc.Industry.nunique()

# Total Number of Unicorn Continents
continents = uc.Continent.nunique()

# Total No of Unicorn Countries
countries = uc.Country.nunique()

# Total Number of Unicorn Cities
cities = uc.City.nunique()

# Total Amount Received by Unicorn Companies
funding_sum = uc['Funding'].sum()
new_funding = '${:,.3f}.B'.format(funding_sum/ 10**9)

# Print Functions
print('The Total Valuation of Unicorns is: ', new_valuation)
print('The Total No. of Unicorn Companies is: ', companies)
print('The Total No. of Unicorn Industries is: ', industries)
print('The Total No. of Unicorn Continents is: ', continents)
print('The Total No. of Unicorn Countries is: ', countries)
print('The Total No. of Unicorn Cities is: ', cities)
print('The Total Funding received by Unicorns is: ', new_funding)
The Total Valuation of Unicorns is:  $3.711.T
The Total No. of Unicorn Companies is:  1074
The Total No. of Unicorn Industries is:  15
The Total No. of Unicorn Continents is:  6
The Total No. of Unicorn Countries is:  46
The Total No. of Unicorn Cities is:  257
The Total Funding received by Unicorns is:  $591.820.B
In [18]:
# Step 1: Find the maximum funding value
max_funding = uc['Funding'].max()

# Step 2: Locate the company associated with this maximum funding
max_funding_companies = uc[uc['Funding'] == max_funding]['Company'].tolist()

# Step 3: Convert the funding value to a scalable format
def format_funding(value):
    if value >= 1e9:
        return f"{value / 1e9:.2f} billion"
    elif value >= 1e6:
        return f"{value / 1e6:.2f} million"
    else:
        return f"{value:.2f}"

max_funding_formatted = format_funding(max_funding)

# Step 4: Print the results
for company in max_funding_companies:
    print(f"Company: {company}, Funding: {max_funding_formatted}")
Company: JUUL Labs, Funding: 14.00 billion
In [39]:
# Step 1: Find the maximum funding value
max_valuation = uc['Valuation'].max()

# Step 2: Locate the company associated with this maximum funding
max_valuation_companies = uc[uc['Valuation'] == max_valuation]['Company'].tolist()

# Step 3: Convert the funding value to a scalable format
def format_valuation(value):
    if value >= 1e9:
        return f"{value / 1e9:.2f} billion"
    elif value >= 1e6:
        return f"{value / 1e6:.2f} million"
    else:
        return f"{value:.2f}"

max_valuation_formatted = format_valuation(max_valuation)

# Step 4: Print the results
for company in max_valuation_companies:
    print(f"Company: {company}, Valuation: {max_valuation_formatted}")
Company: Bytedance, Valuation: 180.00 billion

Uni-Variative Analysis¶

In [19]:
import pandas as pd
import plotly.graph_objects as go

# Calculate the total valuation by industry
industry_total_val = uc[["Industry", "Valuation"]].groupby(by="Industry").sum()

# Ensure the resulting DataFrame is not empty
if not industry_total_val.empty:
    # Define colors for the pie chart
    colors = [
        '#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A',
        '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52'
        # Add more colors if needed
    ]

    # Create the pie chart
    fig3 = go.Figure()
    fig3.add_trace(go.Pie(labels=industry_total_val.index,
                          values=industry_total_val["Valuation"],
                          marker=dict(colors=colors)))

    # Update the layout
    fig3.update_layout(title='Industries distribution by total valuation')

    # Show the figure
    fig3.show()
else:
    print("The DataFrame is empty. Please check the input data.")
In [18]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Filter out rows where 'Year Founded' is NaN
uc_with_fyear = uc[~uc['Year Founded'].isna()]

# Count the number of occurrences for each 'Year Founded'
num_by_founded_year = uc_with_fyear["Year Founded"].value_counts().reset_index()
num_by_founded_year.columns = ['Year Founded', 'Count']
num_by_founded_year["Year Founded"] = num_by_founded_year["Year Founded"].astype(np.int64)
num_by_founded_year = num_by_founded_year[num_by_founded_year["Year Founded"] >= 1990]
num_by_founded_year.sort_values(by=["Year Founded"], inplace=True)
years = pd.DataFrame({"years": num_by_founded_year["Year Founded"]})

# Create the initial figure
fig1 = go.Figure()
fig1.add_trace(go.Scatter(x=num_by_founded_year["Year Founded"], y=num_by_founded_year["Count"],
                          mode='lines',
                          name='lines'))

# Create update menu buttons
updatemenu = []
buttons = []

# Button for all industries
buttons.append(dict(method='update',
                    label="All industries",
                    visible=True,
                    args=[{'y': [num_by_founded_year["Count"]],
                           'x': [num_by_founded_year["Year Founded"]],
                           'type': 'scatter'},
                          {'title': "Number of unicorns in All industries since 1990"}],
                    ))

# Buttons for each industry
for indst in uc_with_fyear.Industry.unique():
    selected_industry = uc_with_fyear[uc_with_fyear["Industry"] == indst]
    temp_vc = selected_industry["Year Founded"].value_counts().reset_index()
    temp_vc.columns = ['Year Founded', 'Count']
    temp_vc["Year Founded"] = temp_vc["Year Founded"].astype(np.int64)
    temp_vc.sort_values(by=["Year Founded"], inplace=True)
    result = years.set_index('years').join(temp_vc.set_index('Year Founded'), how='left').fillna(0)
    result["Count"] = result["Count"].astype(int)
    buttons.append(dict(method='update',
                        label=indst,
                        visible=True,
                        args=[{'y': [result["Count"]],
                               'x': [years["years"]],
                               'type': 'scatter'},
                              {'title': f"Number of unicorns in {indst} since 1990"}],
                        ))

# Update the layout with the buttons
updatemenu = [dict()]

updatemenu[0]['buttons'] = buttons
updatemenu[0]['direction'] = 'down'
updatemenu[0]['showactive'] = True

# Update the figure layout
fig1.update_layout(showlegend=False, updatemenus=updatemenu)
fig1.update_layout(
    title="Number of unicorns in All industries since 1990",
    xaxis_title='Founded Year',
    yaxis_title='Unicorns count')

# Show the figure
fig1.show()
In [35]:
# The Decriptive Statistics of the Numerical Columns

int_columns = uc['Year Founded'].describe().astype(int)
float_columns = uc[['Valuation','Funding']].describe().astype(float)

# Concatenate/ join the'Year Funded' to show as int, and 'Valuation' and 'Funding' to show as float                                                        
joined_columns = pd.concat([float_columns, int_columns], axis= 1)
joined_columns
Out[35]:
Valuation Funding Year Founded
count 1.074000e+03 1.074000e+03 1074
mean 3.455307e+09 5.510428e+08 2012
std 8.547022e+09 8.077194e+08 5
min 1.000000e+09 0.000000e+00 1919
25% 1.000000e+09 2.180000e+08 2011
50% 2.000000e+09 3.650000e+08 2014
75% 3.000000e+09 6.030000e+08 2016
max 1.800000e+11 1.400000e+10 2021

Summary of Unicorn Company Data¶

Valuation:

  • Mean: 3.46 billion USD
  • Standard Deviation: High variability
  • Minimum: 1 billion USD
  • Maximum: 180 billion USD
  • Interquartile Range (IQR): 1 billion USD to 3 billion USD

Funding:

  • Mean: 551 million USD
  • Standard Deviation: High variability
  • Minimum: 0 USD
  • Maximum: 14 billion USD
  • IQR: 218 million USD to 603 million USD

Year Founded:

  • Mean: 2012
  • Standard Deviation: 5 years
  • Earliest: 1919
  • Most Recent: 2021
  • IQR: 2011 to 2016

Summary:

  • Unicorn companies have high valuations and attract substantial investment.
  • Significant variability exists in valuations and funding amounts.
  • Most unicorns are relatively young, with many founded in the last decade.
In [42]:
# Step 1: Identify the top 10 companies by their total valuation
top10_companies = uc.groupby('Company')['Valuation'].sum().sort_values(ascending=False).head(10).reset_index(name='Valuation')

# Step 2: Scale the valuations to billions
top10_companies['Valuation (in billions)'] = top10_companies['Valuation'] / 1e9

# Display the top 10 companies with scaled valuations
top10_companies[['Company', 'Valuation (in billions)']]
Out[42]:
Company Valuation (in billions)
0 Bytedance 180.0
1 SpaceX 100.0
2 SHEIN 100.0
3 Stripe 95.0
4 Klarna 46.0
5 Checkout.com 40.0
6 Canva 40.0
7 Instacart 39.0
8 JUUL Labs 38.0
9 Databricks 38.0
In [44]:
# Create a figure and axis
fig, ax = plt.subplots(figsize=(12, 8))

# Create the horizontal barplot with a custom color palette
palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
sns.barplot(x="Valuation", y="Company", data=top10_companies, palette=palette, ax=ax)

# Add 'Valuation' labels to the bars
for index, row in top10_companies.iterrows():
    valuation = row["Valuation"]
    if valuation >= 1e9:
        label = f"${valuation/1e9:.1f}B"
    else:
        label = f"${valuation/1e6:.1f}M"
    ax.text(valuation, index, label, ha="left", va="center", fontsize=10, color="black", fontweight='bold')

# Customize the plot
ax.set(xlabel="Total Valuation (in USD)", ylabel="Company")
plt.title("Top 10 Most Valuable Unicorn Companies", fontsize=16, fontweight='bold')
plt.xticks(rotation=0)
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Improve layout
plt.tight_layout()

# Show the plot
plt.show()
In [49]:
top9_companyF = uc.groupby('Company')['Funding'].sum().sort_values(ascending=False).head(9).reset_index()

# Step 2: Scale the funding values to billions
top9_companyF['Funding (in billions)'] = top9_companyF['Funding'] / 1e9

# Display the top 9 companies with scaled funding values
top9_companyF[['Company', 'Funding (in billions)']]
Out[49]:
Company Funding (in billions)
0 JUUL Labs 14.0
1 Bytedance 8.0
2 Epic Games 7.0
3 SpaceX 7.0
4 Global Switch 5.0
5 Xingsheng Selected 5.0
6 Swiggy 5.0
7 J&T Express 5.0
8 BYJU's 4.0
In [51]:
fig, ax = plt.subplots(figsize=(12, 8))

# Create the horizontal barplot with a custom color palette
palette = sns.color_palette("viridis", 8)
sns.barplot(x="Funding (in billions)", y="Company", data=top9_companyF, palette=palette, ax=ax)

# Add 'Funding' labels to the bar
for index, row in top9_companyF.iterrows():
    funding = row["Funding (in billions)"]
    label = f"${funding:.1f}B"
    ax.text(funding, index, label, ha="left", va="center", fontsize=10, color="black", fontweight='bold')

# Customize the plot
ax.set(xlabel="Total Funding (in billions USD)", ylabel="Company")
plt.title("Top 9  Companies With The Highest Funding", fontsize=16, fontweight='bold')
plt.xticks(rotation=0)
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Improve layout
plt.tight_layout()

# Show the plot
plt.show()
In [69]:
uc['ROI'] = (uc['Valuation'] - uc['Funding']) / uc['Funding']

# Step 2: Handle any NaN or infinite values
df = uc.replace([np.inf, -np.inf], np.nan).dropna(subset=['ROI'])

# Step 3: Identify the top companies by ROI
top10_roi = df.sort_values(by='ROI', ascending=False).head(10).reset_index()

# Step 4: Display the top 10 companies
top10_roi[['Company', 'Valuation', 'Funding', 'ROI']]
Out[69]:
Company Valuation Funding ROI
0 Zapier 4.000000e+09 1000000.0 3999.000000
1 Dunamu 9.000000e+09 71000000.0 125.760563
2 Workhuman 1.000000e+09 9000000.0 110.111111
3 CFGI 2.000000e+09 19000000.0 104.263158
4 Manner 1.000000e+09 10000000.0 99.000000
5 DJI Innovations 8.000000e+09 105000000.0 75.190476
6 GalaxySpace 1.000000e+09 14000000.0 70.428571
7 Canva 4.000000e+10 572000000.0 68.930070
8 Il Makiage 2.000000e+09 29000000.0 67.965517
9 Revolution Precrafted 1.000000e+09 15000000.0 65.666667
In [67]:
# Step 4: Create the plot
fig, ax = plt.subplots(figsize=(12, 8))

# Create the horizontal barplot with a custom color palette
palette = sns.color_palette("coolwarm", 10)
sns.barplot(x="ROI", y="Company", data=top10_roi, palette=palette, ax=ax)

# Add 'ROI' labels to the bars
for index, row in top10_roi.iterrows():
    roi = row["ROI"]
    label = f"{roi:.2f}x"
    ax.text(roi, index, label, ha="left", va="center", fontsize=10, color="black", fontweight='bold')

# Customize the plot
ax.set(xlabel="Return on Investment (ROI)", ylabel="Company")
plt.title("Top 10 Companies With The Highest ROI", fontsize=16, fontweight='bold')
plt.xticks(rotation=0)
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Improve layout
plt.tight_layout()

# Show the plot
plt.show()
In [74]:
# Step 1: Group by Industry and calculate the total valuation for each industry
industry_valuation = uc.groupby('Industry')['Valuation'].sum().sort_values(ascending=False).reset_index()

# Step 2: Select the top 5 industries by valuation
top5_industries = industry_valuation.head().copy()

# Step 3: Scale the valuation values to billions for better readability
top5_industries.loc[:, 'Valuation (in billions)'] = top5_industries['Valuation'] / 1e9

# Step 4: Display the top 5 industries with their valuation
top5_industries[['Industry', 'Valuation (in billions)']]
Out[74]:
Industry Valuation (in billions)
0 Fintech 882.0
1 Internet software & services 595.0
2 E-commerce & direct-to-consumer 426.0
3 Artificial Intelligence 377.0
4 Other 252.0
In [75]:
# Step 4: Create the pie chart
fig, ax = plt.subplots(figsize=(10, 8))

# Create the pie chart with a custom color palette
colors = sns.color_palette("viridis", len(top5_industries))
wedges, texts, autotexts = ax.pie(
    top5_industries['Valuation (in billions)'],
    labels=top5_industries['Industry'],
    colors=colors,
    autopct='%1.1f%%',
    startangle=140,
    pctdistance=0.85,
    wedgeprops={'linewidth': 1, 'edgecolor': 'white'}
)

# Customize the text properties
for text in texts:
    text.set_color('black')
    text.set_fontsize(12)
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontsize(10)
    autotext.set_fontweight('bold')

# Draw a circle at the center of the pie to make it a donut chart
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig.gca().add_artist(centre_circle)

# Equal aspect ratio ensures that pie is drawn as a circle.
ax.axis('equal')

plt.title("Top 5 Unicorn Industries by Valuation", fontsize=16, fontweight='bold')
plt.show()
In [76]:
# The Top 5 Countries with the Most Number of Unicorns
top5_countries = uc.groupby('Country')['Company'].size().sort_values(ascending= False).head()
top5_countries
Out[76]:
Country
United States     562
China             173
India              65
United Kingdom     43
Germany            26
Name: Company, dtype: int64
In [77]:
top5_countries = pd.Series({
    'USA': 400,
    'China': 300,
    'India': 100,
    'UK': 80,
    'Germany': 60
})

# Extract the data for the pie chart
countries = top5_countries.index
company_counts = top5_countries.values

# Define the explode parameter to separate each slice
explode = [0.05] * len(countries)

# Create the pie chart with percentage labels inside and black labels outside
plt.figure(figsize=(10, 8))  # Adjust the figure size as needed

wedges, texts, autotexts = plt.pie(
    company_counts,
    labels=countries,
    autopct='%1.1f%%',
    pctdistance=0.85,  # Adjust the percentage label position inside the pie
    explode=explode,  # Separate each portion slightly
    startangle=60,  # Start angle to rotate the pie chart
    wedgeprops={'edgecolor': 'black', 'linewidth': 1}  # Control edge color and width
)

# Customize the labels inside and outside the pie chart
for text, autotext in zip(texts, autotexts):
    text.set(color='black', fontsize=14, fontweight='bold')
    autotext.set(color='white', fontsize=12, fontweight='bold')

# Add a title
plt.title('Top 5 Countries With Most Number of Unicorn Companies', fontsize=16, fontweight='bold')

# Display the pie chart
plt.axis('equal')  # Equal aspect ratio ensures that the pie is drawn as a circle.
plt.show()
In [78]:
# The Top 5 Cities with the higest number of Unicorn Companies
top5_cities = uc.groupby(['City'])['Company'].size().sort_values(ascending=False).head(5)
top5_cities
Out[78]:
City
San Francisco    152
New York         103
Beijing           63
Shanghai          44
London            34
Name: Company, dtype: int64
In [85]:
# Create a horizontal barplot to show the Top 10 Cities with the highest concentration of Unicorn Companies 
plt.figure(figsize=(10, 6))  # Adjust the figure size as needed
sns.set_style("white")
custom_palette = sns.color_palette = ['#08306b', '#08519c', '#6F8FAF', '#0096FF', '#2171b5']

plt.figure(figsize=(12, 6))
sns.barplot(x='City', y='Company', data=top5_cities.reset_index(), palette= custom_palette, dodge=False)

# Add labels for the number of cities for each bar
for i in range(len(top5_cities)):
    plt.text(x=i, y=top5_cities.iloc[i]+1, s=top5_cities.iloc[i], ha='center', fontsize=11, fontweight= 'bold')

plt.title('Top 5 Cities with the Most Concentration of Unicorn Companies', fontsize=15, fontweight='bold')
plt.xlabel('Unicorn Cities', fontsize=10, fontweight= 'bold')
plt.ylabel('No. of Unicorn Companies', fontsize=10, fontweight= 'bold')

plt.show()
<Figure size 1000x600 with 0 Axes>
In [86]:
# Unicorn Companies Distribution Across Continent
unicorn_cont = uc.groupby('Continent')['Company'].size().sort_values(ascending= False).head(10)
unicorn_cont
Out[86]:
Continent
North America    589
Asia             310
Europe           143
South America     21
Oceania            8
Africa             3
Name: Company, dtype: int64
In [100]:
# Sample data for demonstration (replace with your actual data)
continents = unicorn_cont.index
company_counts = unicorn_cont.values

# Create a countplot
plt.figure(figsize=(10, 6))
sns.barplot(x=continents, y=company_counts, palette=['#08306b', '#0096FF', '#2171b5', '#08306b', '#08519c', '#6F8FAF', '#0096FF', '#2171b5', '#08306b', '#08519c'])
plt.title('Distribution of Unicorn Companies Across Continents')
plt.xlabel('Continent')
plt.ylabel('Number of Companies')

plt.show()
In [102]:
# Unicorn Companies Distribution across Industries
company_spread = uc.groupby('Industry')['Company'].size().sort_values(ascending= False).reset_index(name= 'Total Companies')
company_spread
Out[102]:
Industry Total Companies
0 Fintech 224
1 Internet software & services 205
2 E-commerce & direct-to-consumer 111
3 Artificial Intelligence 84
4 Health 74
5 Other 58
6 Supply chain, logistics, & delivery 57
7 Cybersecurity 50
8 Data management & analytics 41
9 Mobile & telecommunications 38
10 Hardware 34
11 Auto & transportation 31
12 Edtech 28
13 Consumer & retail 25
14 Travel 14
In [107]:
# Create a bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x='Total Companies', y='Industry', data=company_spread, palette='cubehelix', alpha=1.0)
plt.title('Unicorn Companies Distribution across Industries')
plt.xlabel('Total Companies')
plt.ylabel('Industry')

plt.show()
In [20]:
# The Top 10 Select Investors that have invested the highest funds in Unicorn Companies

def format_funding(funding):
    if funding >= 1000000000:
        return f'${funding/1000000000:.1f}B'
    elif funding >= 1000000:
        return f'${funding/1000000:.1f}M'
    else:
        return f'${funding}'

top10_investors = uc.groupby('Select Investors')['Funding'].sum().sort_values(ascending=False).head(10)
top10_investors = top10_investors.apply(format_funding)
top10_investors
Out[20]:
Select Investors
Tiger Global Management                                                               $14.0B
Sequoia Capital China, SIG Asia Investments, Sina Weibo, Softbank Group                $8.0B
Tencent Holdings, KKR, Smash Ventures                                                  $7.0B
Founders Fund, Draper Fisher Jurvetson, Rothenberg Ventures                            $7.0B
Aviation Industry Corporation of China, Essence Financial, Jiangsu Sha Steel Group     $5.0B
Accel India, SAIF Partners, Norwest Venture Partners                                   $5.0B
KKR, Tencent Holdings, Sequoia Capital China                                           $5.0B
Hillhouse Capital Management, Boyu Capital, Sequoia Capital China                      $5.0B
Baidu Capital, Linear Venture, Tencent                                                 $4.0B
Tencent Holdings, Warbug Pincus, IDG Capital                                           $4.0B
Name: Funding, dtype: object
In [113]:
investors_stacked = uc['Select Investors'].str.split(', ', expand=True).stack().reset_index(level=1, drop=True).rename('Investors')

# Step 2: Count the occurrences of each investor
investor_counts = investors_stacked.value_counts().head(10)

investor_counts
Accel                          60
Tiger Global Management        53
Andreessen Horowitz            53
Sequoia Capital China          48
Insight Partners               47
Sequoia Capital                47
Lightspeed Venture Partners    34
SoftBank Group                 34
General Catalyst               33
Index Ventures                 32
Name: Investors, dtype: int64
In [116]:
# Step 4: Create the pie chart
plt.figure(figsize=(8, 8))
plt.pie(investor_counts, labels=investor_counts.index, autopct='%1.1f%%', startangle=140, colors=plt.cm.Paired(range(len(investor_counts))))
plt.title('Top 10 Investors in Unicorn Companies', fontsize=16, fontweight='bold', y=1.1)
Out[116]:
Text(0.5, 1.1, 'Top 10 Investors in Unicorn Companies')
In [21]:
def format_funding(funding):
    if funding >= 1e9:
        return f'${funding / 1e9:.1f}B'
    elif funding >= 1e6:
        return f'${funding / 1e6:.1f}M'
    else:
        return f'${funding}'

# Calculate the total valuation for each country
top10_countries = uc.groupby('Country')['Valuation'].sum().sort_values(ascending=False).head(10)

# Format the valuation values
top10_countries_formatted = top10_countries.apply(format_funding)

# Display the top 10 countries with their formatted valuation
print(top10_countries_formatted)
Country
United States     $1933.0B
China              $696.0B
India              $196.0B
United Kingdom     $195.0B
Germany             $72.0B
Sweden              $63.0B
Australia           $56.0B
France              $55.0B
Canada              $49.0B
South Korea         $41.0B
Name: Valuation, dtype: object
In [128]:
# Create a bar plot
plt.figure(figsize=(12, 6))
top10_countries.plot(kind='bar', color=['#6F8FAF', '#0096FF', '#2171b5', '#08306b', '#08519c', '#6F8FAF', '#0096FF', '#2171b5', '#08306b', '#08519c'])
plt.title('Top 10 Countries with the Highest Unicorn Valuation', fontsize=16, fontweight='bold')
plt.xlabel('Country', fontsize=14)
plt.ylabel('Total Valuation (in USD)', fontsize=14)
plt.xticks(rotation=0, ha='right', fontsize=12)
plt.tight_layout()
plt.show()
In [129]:
# Unicorn Companies Trend By Year Joined
total_companies = uc.groupby('Date Joined')['Company'].size().reset_index(name= 'Total Companies')
total_companies
Out[129]:
Date Joined Total Companies
0 2007-07-02 1
1 2011-04-02 1
2 2011-12-12 1
3 2012-02-13 1
4 2012-06-06 1
... ... ...
634 2022-03-23 3
635 2022-03-29 2
636 2022-03-30 2
637 2022-03-31 1
638 2022-04-05 3

639 rows × 2 columns

In [140]:
 #Convert 'Date Joined' to datetime format
uc['Date Joined'] = pd.to_datetime(uc['Date Joined'])

# Extract year from the 'Date Joined' column
uc['Year'] = uc['Date Joined'].dt.year

# Sum the total companies by the year they joined
total_companies_by_year = uc.groupby('Year')['Company'].size().reset_index(name='Total Companies')

# Create a line plot
plt.figure(figsize=(12, 6))  # Adjust the figure size as needed
sns.set_style("white")

# Create the line plot
ax = sns.lineplot(x='Year', y='Total Companies', data=total_companies_by_year, marker='o', color='darkred', label='Total No. of Unicorn Companies')

# Add labels with the count on top of each data point
for index, row in total_companies_by_year.iterrows():
    ax.text(row['Year'], row['Total Companies'], f'{row["Total Companies"]}', ha='center', va='bottom', fontsize=10, color='black', fontweight='bold')

# Customize the plot
ax.set(xlabel="Year Joined", ylabel="Total Companies")
plt.title("Unicorn Companies Trend By Year Joined", fontsize=13, fontweight='bold')

# Hide the grid lines
ax.grid(False)

# Show the plot
plt.tight_layout()
plt.show()
In [143]:
companies_by_year = uc.groupby('Year Founded')['Company'].nunique().reset_index(name='Total Companies')
companies_by_year
Out[143]:
Year Founded Total Companies
0 1919 1
1 1979 1
2 1984 1
3 1990 1
4 1991 1
5 1992 1
6 1993 1
7 1994 2
8 1995 2
9 1996 1
10 1997 1
11 1998 5
12 1999 8
13 2000 11
14 2001 9
15 2002 4
16 2003 8
17 2004 8
18 2005 14
19 2006 15
20 2007 24
21 2008 27
22 2009 34
23 2010 40
24 2011 82
25 2012 95
26 2013 87
27 2014 109
28 2015 155
29 2016 110
30 2017 74
31 2018 61
32 2019 45
33 2020 25
34 2021 11
In [34]:
# Multivariate
# Unicorn Companies Distribution across Cities, Countries and Continents
top_cities = uc.groupby(['City', 'Country', 'Continent'])['Company'].nunique().reset_index(name='Total Companies')
top_cities.head(11)
Out[34]:
City Country Continent Total Companies
0 Aarhus Denmark Europe 1
1 Aberdeen United Kingdom Europe 1
2 Alameda United States North America 1
3 Alexandria Australia Oceania 1
4 Altrincham United Kingdom Europe 1
5 Ambler United States North America 1
6 Amsterdam Netherlands Europe 6
7 Andheri India Asia 1
8 Arlington United States North America 1
9 Atlanta United States North America 7
10 Austin United States North America 8
In [35]:
import plotly.express as px
# Create the scatter plot
fig = px.scatter(
    top_cities, 
    x='City', 
    y='Total Companies', 
    color='Continent', 
    size='Total Companies', 
    hover_name='City',
    title='Unicorn Companies Distribution across Cities, Countries, and Continents',
    labels={'Total Companies': 'Total Companies'}
)

# Show the plot
fig.show()
In [22]:
# Multivariate
# Unicorn Companies Valuation Spread over the Years
def format_funding(funding):
    if funding >= 1000000000:
        return f'${funding/1000000000:.1f}B'
    elif funding >= 1000000:
        return f'${funding/1000000:.1f}M'
    else:
        return f'${funding}'
    
unicorn_val = uc.groupby('Date Joined')['Valuation'].sum().sort_values(ascending= True)
unicorn_val = unicorn_val.apply(format_funding)
unicorn_val
Out[22]:
Date Joined
2007-07-02      $1.0B
2018-04-10      $1.0B
2020-12-03      $1.0B
2020-11-12      $1.0B
2018-05-03      $1.0B
               ...   
2021-07-20     $49.0B
2014-01-23     $95.0B
2012-12-01    $100.0B
2018-07-03    $100.0B
2017-04-07    $180.0B
Name: Valuation, Length: 639, dtype: object
In [23]:
# Function to format the valuation
def format_funding(funding):
    if funding >= 1e9:
        return f'${funding/1e9:.1f}B'
    elif funding >= 1e6:
        return f'${funding/1e6:.1f}M'
    else:
        return f'${funding:.0f}'

# Group by 'Industry' and sum the 'Valuation', then get the 5 industries with the least valuation
least5_ind = uc.groupby('Industry')['Valuation'].sum().sort_values(ascending=True).head(5)

# Apply the formatting function
least5_ind_formatted = least5_ind.apply(format_funding)

# Create a DataFrame for better handling in plotting
least5_ind_df = least5_ind_formatted.reset_index()
least5_ind_df.columns = ['Industry', 'Valuation']
least5_ind_df
Out[23]:
Industry Valuation
0 Travel $46.0B
1 Mobile & telecommunications $89.0B
2 Auto & transportation $99.0B
3 Hardware $99.0B
4 Edtech $100.0B
In [24]:
# Convert 'Valuation' to numeric format
least5_ind_df['Valuation'] = pd.to_numeric(least5_ind_df['Valuation'].replace('[\$,]', '', regex=True), errors='coerce')

# Drop rows with NaN values in 'Valuation'
least5_ind_df = least5_ind_df.dropna(subset=['Valuation'])

# Convert valuations to billions
least5_ind_df['Valuation'] = least5_ind_df['Valuation'] / 1e9

# Display the resulting dataframe
print(least5_ind_df)
Empty DataFrame
Columns: [Industry, Valuation]
Index: []
In [25]:
import plotly.express as px

# Create a treemap
fig = px.treemap(
    least5_ind_df, 
    path=['Industry'], 
    values='Valuation',
    color='Valuation',
    color_continuous_scale='RdYlGn',  # Changed color scale to 'RdYlGn'
    title='Top 10 Industries with the Least Valuation'
)

# Update the layout for better appearance
fig.update_layout(
    title=dict(
        text='Top 10 Industries with the Least Valuation',
        x=0.5,  # Center the title
        xanchor='center',
        font=dict(size=24, family='Arial', color='black')
    ),
    font=dict(size=14, family='Arial'),
    coloraxis_colorbar=dict(
        title='Valuation (in Billions)',
        titleside='right',
        titlefont=dict(size=16, family='Arial'),
        tickfont=dict(size=14, family='Arial')
    )
)

# Format the values in the treemap for better readability
fig.data[0].texttemplate = "%{label}<br>Valuation: $%{value:.2f}B"

# Show the treemap
fig.show()
In [26]:
# Group by 'Company' and sum the 'Valuation'
top7_companies = uc.groupby('Company')['Valuation'].sum().sort_values(ascending=False).head(7)

# Reset the index to create a new default integer-based index
top7_companies_reset = top7_companies.reset_index()

# Convert 'Valuation' to numeric format
top7_companies_reset['Valuation'] = pd.to_numeric(top7_companies_reset['Valuation'].replace('[\$,]', '', regex=True), errors='coerce')

# Drop rows with NaN values in 'Valuation'
top7_companies_reset = top7_companies_reset.dropna(subset=['Valuation'])

# Convert valuations to billions
top7_companies_reset['Valuation'] = top7_companies_reset['Valuation'] / 1e9

# Create a treemap
fig = px.treemap(
    top7_companies_reset, 
    path=['Company'], 
    values='Valuation',
    color='Company',  # Use 'Company' column to assign different colors
    title='Top 7 Companies by Valuation'
)

# Update the layout for better appearance
fig.update_layout(
    title=dict(
        text='Top 7 Companies by Valuation',
        x=0.5,  # Center the title
        xanchor='center',
        font=dict(size=24, family='Arial', color='black')
    ),
    font=dict(size=14, family='Arial')
)

# Format the values in the treemap for better readability
fig.data[0].texttemplate = "%{label}<br>Valuation: $%{value:.2f}B"

# Show the treemap
fig.show()
In [28]:
from plotly.subplots import make_subplots

#Group by 'Company' and sum the 'Valuation'
top7_companies = uc.groupby('Company')['Valuation'].sum().sort_values(ascending=True).head(7)

# Reset the index to create a new default integer-based index
top7_companies_reset = top7_companies.reset_index()

# Convert 'Valuation' to numeric format
top7_companies_reset['Valuation'] = pd.to_numeric(top7_companies_reset['Valuation'].replace('[\$,]', '', regex=True), errors='coerce')

# Drop rows with NaN values in 'Valuation'
top7_companies_reset = top7_companies_reset.dropna(subset=['Valuation'])

# Convert valuations to billions
top7_companies_reset['Valuation'] = top7_companies_reset['Valuation'] / 1e9

# Create a treemap
fig = px.treemap(
    top7_companies_reset, 
    path=['Company'], 
    values='Valuation',
    color='Company',  # Use 'Company' column to assign different colors
    title='Top 7 Companies by Valuation'
)

# Update the layout for better appearance
fig.update_layout(
    title=dict(
        text='Top 7 Companies by Valuation',
        x=0.5,  # Center the title
        xanchor='center',
        font=dict(size=24, family='Arial', color='black')
    ),
    font=dict(size=14, family='Arial')
)

# Format the values in the treemap for better readability
fig.data[0].texttemplate = "%{label}<br>Valuation: $%{value:.2f}B"

# Show the treemap
fig.show()
In [27]:
import pandas as pd
import plotly.express as px

# Convert "Valuation" column to numeric
uc['Valuation'] = pd.to_numeric(uc['Valuation'], errors='coerce')

# Drop rows with missing values in "Valuation" column
uc = uc.dropna(subset=['Valuation'])

# Ensure 'Investors Count' exists; if not, compute it as a placeholder
if 'Investors Count' not in uc.columns:
    uc['Investors Count'] = uc['Select Investors'].apply(lambda x: len(str(x).split(',')))

# Get the top 20 companies by valuation
top_20_companies = uc.nlargest(20, 'Valuation')

# Create the scatter plot using Plotly Express
fig4 = px.scatter(top_20_companies, x="Valuation", y="Funding",
                  size="Investors Count", 
                  color="Industry",
                  hover_name="Company", size_max=60, title="Top 20 companies")

# Update the layout to include a legend title
fig4.update_layout(
    legend=dict(
        title="Industry"
    )
)

# Show the figure
fig4.show()
In [43]:
# Show the figure if data exists, otherwise print a message
if not top_20_companies.empty:
    fig4.show()
else:
    print("No data available to plot.")
No data available to plot.
In [29]:
import pycountry
import pandas as pd
import plotly.express as px

# Calculate the total valuation by country
top_10_countries = uc[["Country", "Valuation"]].groupby(by="Country").sum()
top_10_countries = top_10_countries.sort_values(by="Valuation", ascending=False)[:10].reset_index()
top_10_countries_total_valuation = top_10_countries["Valuation"].sum()
top_10_countries_total_valuation_perc = top_10_countries_total_valuation * 100 / uc["Valuation"].sum()
top_10_countries["iso_code"] = top_10_countries["Country"].apply(lambda x: pycountry.countries.lookup(x).alpha_3)

# Create the choropleth map using Plotly Express
fig7 = px.choropleth(top_10_countries, locations="iso_code", color="Valuation",
                    hover_name="Country", color_continuous_scale='sunsetdark',
                    title=f"Valuation for top 10 countries is {top_10_countries_total_valuation:.1f} B$ ({top_10_countries_total_valuation_perc:.1f}% of total)"
                   )

# Show the figure
fig7.show()
In [ ]: